In [1]:
import pandas
import math
import json
from numpy.random import *
from matplotlib import style
style.use('fivethirtyeight')
%pylab inline
In [2]:
# Load the per-site gender counts (rows: wiki site e.g. 'enwiki'; columns: gender
# labels, including a 'nan' column for unknown gender) from the 2014-10-13 snapshot.
# Missing counts become 0.
# NOTE(review): 'site_linkss-index.csv' has a doubled 's' — presumably matches the
# snapshot's actual file name; confirm against the data directory.
sitelinks = pandas.read_csv('snapshot_data/2014-10-13/property_indexes/site_linkss-index.csv',index_col=0).fillna(0)
In [3]:
# Peek at the first rows to sanity-check the loaded table.
sitelinks.head()
Out[3]:
The Wikidata Items that have no sitelinks
This means there are 24,454 humans in Wikidata that are not connected to any other Wiki. We don't know the gender for 4,172 of them; 4,027 are female, and 16,255 are male.
Now we will look at the wikis that have the most and the least gender data recorded for their humans.
In [4]:
# Derive per-wiki totals from the raw gender-count columns:
#  - human_total: all humans on the wiki (row sum over every gender column, including 'nan')
#  - gendered_total: humans whose gender is recorded (total minus the 'nan' column)
#  - gendered_per: fraction of humans with a recorded gender
#  - nonbin_total: gendered humans counted as neither 'female' nor 'male'
sitelinks['human_total'] = sitelinks.sum(axis=1)
sitelinks['gendered_total'] = sitelinks['human_total'] - sitelinks['nan']
sitelinks['gendered_per'] = sitelinks['gendered_total'] / sitelinks['human_total']
sitelinks['nonbin_total'] = sitelinks['gendered_total'] - sitelinks['female'] - sitelinks['male']
In [5]:
# Wikis with more than 10,000 humans, ordered by gender-coverage ratio (worst first).
# NOTE(review): DataFrame.sort() is the old pandas API (removed in pandas 0.20);
# use sort_values() if this notebook is re-run on a modern pandas.
sitelinks[sitelinks['human_total'] > 10000].sort('gendered_per').head()
Out[5]:
In [5]:
In [6]:
# Display-only: mask zero counts (shown as NaN) so non-zero entries stand out.
# The result is not assigned; sitelinks itself is unchanged.
sitelinks[sitelinks != 0]
Out[6]:
In [7]:
# Drop the row whose index label is NaN (the items with no sitelink at all),
# mutating sitelinks in place.
# NOTE(review): relies on NaN-label matching in this pandas version — confirm
# the NaN-indexed row is actually removed after this cell.
sitelinks.drop(float('nan'), inplace=True)
In [8]:
# Collect the distinct project suffixes that follow 'wiki' in the site names
# ('' for plain Wikipedia, 'quote', 'source', ...). Names that split into three
# parts — i.e. contain 'wiki' twice, like 'wikidatawiki' — are skipped.
suffixes = set()
for site_name in sitelinks.index:
    pieces = site_name.split('wiki')
    if len(pieces) != 3:
        suffixes.add(pieces[1])
In [9]:
# Show the set of project suffixes discovered above.
suffixes
Out[9]:
In [10]:
def wikityper(wikiname):
    """Return the project type encoded in a site name.

    The part after 'wiki' names the project ('quote', 'source', ...);
    an empty remainder means a plain Wikipedia, reported as 'pedia'.
    """
    suffix = wikiname.split('wiki')[1]
    if suffix == '':
        return 'pedia'
    return suffix
def wikilanger(wikiname):
    """Return the language code preceding 'wiki' in a site name,
    or None when there is no language prefix (e.g. 'wikidata')."""
    prefix = wikiname.split('wiki')[0]
    return prefix or None
# Tag each row with its project type and language, and derive gender ratios.
# NOTE(review): assigning map(...) directly works because this is Python 2,
# where map returns a list; wrap in list(...) if migrating to Python 3.
sitelinks['wikitype'] = map(wikityper, sitelinks.index)
sitelinks['wikilang'] = map(wikilanger, sitelinks.index)
# Share of gendered biographies that are female / neither female nor male.
sitelinks['fem_per'] = sitelinks['female'] / sitelinks['gendered_total']
sitelinks['nonbin_per'] = sitelinks['nonbin_total'] / sitelinks['gendered_total']
In [11]:
# Group rows by project type ('pedia', 'quote', 'source', ...).
wikitypes = sitelinks.groupby(by='wikitype')
In [12]:
# Average gender counts and ratios per project type.
wikitypes.mean()[['female','male','fem_per','nonbin_per','gendered_total']]
Out[12]:
So this means that the female percentage is actually highest on Wikipedias in general, at 16%, whereas for Wikiquote and Wikisource it's only 8.5% and 4.7% respectively.
In [13]:
# Mapping from wiki language codes to full names (e.g. 'en' -> 'English Wikipedia').
lang_map = json.load(open('helpers/wiki_code_map.json','r'))
def lookup_lang(lang, lang_lookup=None):
    """Translate a wiki language code into a human-readable language name.

    Looks `lang` up in `lang_lookup` (defaults to the module-level `lang_map`)
    and strips a trailing 'Wikipedia' word from the mapped name, so
    'English Wikipedia' becomes 'English'.

    Parameters
    ----------
    lang : str
        Wiki language code, e.g. 'en'.
    lang_lookup : dict, optional
        code -> full-name mapping; defaults to `lang_map` loaded above.
        (New optional parameter; existing single-argument callers unaffected.)

    Returns
    -------
    str
        The language name, or `lang` itself when no mapping exists.
    """
    mapping = lang_map if lang_lookup is None else lang_lookup
    try:
        full = mapping[lang]
    except KeyError:
        # Unknown code: fall back to the raw code rather than failing.
        # BUG FIX: the original used a bare `except:`, which also swallowed
        # unrelated errors; the lookup can only raise KeyError here.
        return lang
    words = full.split()
    if words and words[-1].lower() == 'wikipedia':
        return ' '.join(words[:-1])
    return full
In [14]:
# Bar-chart panels of gender composition across Wikipedia languages.
# Outer loop: two orderings of the wikis (by gendered-biography count, then by
# female share). Middle loop: the two metrics to plot (female share, non-binary
# share). Inner loop: the ordered wikis are cut into `splitpoints` consecutive
# slices so each panel compares similarly-sized wikis.
splitpoints = 5
for sort_term, sort_term_text in [('gendered_total', 'number of Gendered Biographies'), ('fem_per', 'percentage of female Biographies')]:
    # Wikipedias only, ordered by the current sort column (old pandas .sort API).
    ssl = sitelinks[sitelinks['wikitype']=='pedia'].sort([sort_term])
    planstep = len(ssl)/float(splitpoints)  # rows per panel (fractional; slices use ceil/floor)
    for per_type, std_ylim, title_text in [('fem_per', 0.8, 'Female Composition'), ('nonbin_per',0.005, 'Non-binary Gender Percentage')]:
        # NOTE(review): std_ylim is carried in the tuples but never used below.
        fig, axes = plt.subplots(nrows=splitpoints, ncols=1, figsize=(12,20))
        plt.subplots_adjust(hspace = 0.8 )
        for splitpoint in range(0,splitpoints):
            begin = int(math.ceil(splitpoint * planstep))
            end = int(math.floor((splitpoint+1) * planstep))
            bios_list = ssl.iloc[begin:end]['gendered_total']
            minbio = int(min(bios_list))
            maxbio = int(max(bios_list))
            ratios_list = ssl.iloc[begin:end][per_type]
            maxratio = max(ratios_list)
            minratio = min(ratios_list)
            # Bar colour encodes wiki size relative to the largest wiki in this panel.
            bios_size = bios_list.apply(lambda x: math.log(x)/math.log(maxbio))
            my_colors = [(x, x/2, 0.75) for x in bios_size]
            ssl.iloc[begin:end][per_type].plot(ax=axes[splitpoint], kind='bar', color=my_colors)
            axes[splitpoint].set_title(" %s with %s to %s gendered biographies" % (title_text, minbio, maxbio))
            axes[splitpoint].set_ylim((minratio*0.9,maxratio*1.1))
            axes[splitpoint].grid(False)
            axes[splitpoint].yaxis.grid(True, linestyle="--", linewidth=0.3)
            axes[splitpoint].lines[0].set_visible(False)
            axes[splitpoint].yaxis.set_ticks_position('none')
            axes[splitpoint].xaxis.set_ticks_position('none')
            # Replace raw site names ('enwiki', ...) with full language names.
            wikilabels = axes[splitpoint].get_xticklabels()
            wikinames = map(lambda x: x.get_text().split('wiki')[0], wikilabels)
            fullnames = map(lookup_lang, wikinames)
            axes[splitpoint].set_xticklabels(fullnames)
        fig.suptitle("""%s of all Wikipedia Languages\n
ordered by %s. Color is locally relative Wiki Size""" % (title_text, sort_term_text), fontsize=24)
        plt.show()  # BUG FIX: was `plt.show` — attribute access, never actually called
In [15]:
# Log-scale bar chart of gendered-biography counts across all Wikipedias.
sitelinks[sitelinks['wikitype']=='pedia'].sort(['gendered_total'])['gendered_total'].plot(figsize=(36,6),kind='bar', logy=True)
Out[15]:
In [29]:
#find the correct cut off so that we are only inspecting the top TOP wikis by gendered biographies
TOP = 50

def _top_wikis(frame, column, top):
    """Return the Wikipedias above the smallest `column` threshold that keeps at most `top` rows.

    Scans thresholds 0, 1, 2, ... keeping rows where wikitype == 'pedia' and
    frame[column] > threshold, stops at the first threshold whose subset has
    <= `top` rows, and prints that threshold (mirroring the two copy-pasted
    loops this helper replaces). Returns the subset DataFrame (None only if
    frame[column] has no positive maximum).
    """
    subset = None
    for cutoff in range(0, int(max(frame[column]))):
        subset = frame[(frame['wikitype']=='pedia') & (frame[column] > cutoff)]
        if len(subset) <= top:
            print(cutoff)
            break
    return subset

# DEDUP: the original ran two near-identical threshold-search loops.
scatdata = _top_wikis(sitelinks, 'gendered_total', TOP)
nonbintot = _top_wikis(sitelinks, 'nonbin_total', TOP)
In [31]:
# Inspect the Top-N subset selected above.
scatdata.head()
Out[31]:
In [30]:
# Export the scatter inputs for the external (Magnus) gender analysis.
scatdata[['gendered_total','fem_per']].to_csv('Magnus Gender analysis/lang_scat.csv')
In [17]:
# Scatter: female share vs. gendered-biography count (log x) for the Top wikis,
# with each point labelled by its full language name.
sp =scatdata.plot(kind='scatter', x='gendered_total', y='fem_per', logx=True, figsize=(16,10), c='#e3ae3d')
# Turn index site names ('enwiki', ...) into language codes, then full names.
codes = map(lambda x: str(x).split('wiki')[0], scatdata.index)
fullnames = map(lookup_lang,codes)
# Pad axis limits slightly so edge points are not clipped.
sp.set_xlim(min(scatdata['gendered_total']) * 0.85, max(scatdata['gendered_total']) *1.15)
sp.set_ylim(min(scatdata['fem_per']) * 0.95, max(scatdata['fem_per']) *1.05)
# Format y as percentages and x with thousands separators.
sp.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
sp.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))
sp.set_title('Female ratio of biographies, by Wikipedia Language \nTop {} Wikipedias by Biography count\n'.format(TOP), fontsize=24)
sp.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
sp.set_ylabel('Female ratio of Biographies', fontsize=18)
# NOTE(review): `middle` is computed but never used below.
(x1, x2), (y1, y2) = sp.get_xlim(), sp.get_ylim()
middle = (x2-x1)/2.0 , (y2-y1)/2.0
f = matplotlib.font_manager.FontProperties()
font1 = f.copy()
#font1.set_weight('light')
# Annotate each point; a few crowded labels are nudged down-right to reduce overlap.
for label, x, y in zip(fullnames, scatdata['gendered_total'], scatdata['fem_per']):
    plt.annotate(
        label,
        xy = (x, y),
        xytext = (3,-3) if label in ['Latvian','Polish','Dutch','Slovak','Hungarian'] else (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8, fontproperties=font1)
In [18]:
# Scatter: non-binary share vs. gendered-biography count (log x) for the Top wikis.
# BUG FIX: the axes variable was named `np`, shadowing numpy's `np` alias that
# %pylab injects into the namespace; renamed to nb_ax.
nb_ax = scatdata.plot(kind='scatter', x='gendered_total', y='nonbin_per', logx=True, figsize=(16,10), c='#f34141')
# Turn index site names ('enwiki', ...) into language codes, then full names.
codes = map(lambda x: str(x).split('wiki')[0], scatdata.index)
fullnames = map(lookup_lang,codes)
# Format y as percentages and x with thousands separators; pad the limits
# slightly so edge points are not clipped.
nb_ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.2%}'.format(x )))
nb_ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x )))
nb_ax.set_xlim(min(scatdata['gendered_total']) * 0.85, max(scatdata['gendered_total']) *1.15)
nb_ax.set_ylim(min(scatdata['nonbin_per']) * 0.95, max(scatdata['nonbin_per']) *1.05)
nb_ax.set_title('Nonbinary ratio of biographies, by Wikipedia Language \nTop {} Wikipedias by Biography count\n'.format(TOP), fontsize=24)
nb_ax.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
nb_ax.set_ylabel('Nonbinary ratio of Biographies', fontsize=18)
f = matplotlib.font_manager.FontProperties()
font1 = f.copy()
#font1.set_weight('light')
# Annotate each point; the crowded 'Czech' label is nudged down-right.
for label, x, y in zip(fullnames, scatdata['gendered_total'], scatdata['nonbin_per']):
    plt.annotate(
        label,
        xy = (x, y),
        xytext = (5,-5) if label in ['Czech'] else (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8, fontproperties=font1)
plt.show()
# Disabled draft (nonbin_total variant) kept as-is; being the cell's last
# expression, this string is what Out[18] displays.
'''
np = nonbintot.plot(kind='scatter', x='nonbin_total', y='nonbin_per', logx=True, figsize=(16,10))
codes = map(lambda x: str(x).split('wiki')[0], nonbintot.index)
fullnames = map(lookup_lang,codes)
np.set_xlim(min(nonbintot['nonbin_total']) * 0.85, max(nonbintot['nonbin_total']) *1.15)
np.set_ylim(min(nonbintot['nonbin_per']) * 0.95, max(nonbintot['nonbin_per']) *1.05)
np.set_title('Nonbinary percentage of biographies, by Wikipedia Language \nTop 50 Wikipedias by Biography count\n', fontsize=24)
np.set_xlabel('Number of gendered Biographies, log scale', fontsize=18)
np.set_ylabel('Non binary percentage of Biographies', fontsize=18)
for label, x, y in zip(fullnames, nonbintot['nonbin_total'], nonbintot['nonbin_per']):
    plt.annotate(
        label,
        xy = (x, y), xytext = (0,2),
        textcoords = 'offset points', ha = 'center', va = 'bottom',
        fontsize=8)
plt.show()
'''
Out[18]:
In [19]:
# Exploratory: 2-component PCA on (female share, gendered count) for the larger
# Wikipedias (>1000 gendered biographies). The fitted estimator's repr is the
# cell output.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(sitelinks[(sitelinks['wikitype']=='pedia')& (sitelinks['gendered_total']>1000)][['fem_per','gendered_total']])
Out[19]:
In [20]:
# Scratch check: a tiny negative PCA component truncates to 0 under int().
int(float(-6.2590421446e-09))
Out[20]:
In [21]:
# 1-component PCA on log-transformed (nonbinary share, gendered count), using
# only Wikipedias with a non-zero nonbinary share (log of 0 is undefined).
from sklearn.decomposition import PCA  # NOTE(review): duplicate of the import above; harmless
pca = PCA(n_components=1)
logtrans = pandas.DataFrame()
logtrans['nonbin_per'] = sitelinks[(sitelinks['wikitype']=='pedia') & (sitelinks['nonbin_per'] != 0)]['nonbin_per'].apply(math.log)
logtrans['gen_tot'] = sitelinks[(sitelinks['wikitype']=='pedia')& (sitelinks['nonbin_per'] != 0)]['gendered_total'].apply(math.log)
pca.fit(logtrans)
Out[21]:
In [22]:
# Show the fitted PCA direction and its explained-variance ratio.
# NOTE(review): Python 2 print statement, while other cells use print(...);
# convert to print(...) — as two calls, to keep the same output — if migrating
# this notebook to Python 3.
print pca.components_, pca.explained_variance_ratio_
In [23]:
# Correlation between the total gendered count and the per-gender counts.
sitelinks[['gendered_total','female','male']].corr()
Out[23]: